In Class Exercise 8 - Geographically Weighted Predictive Modelling

Author

Kieren Chua

Published

October 21, 2024

Modified

October 21, 2024

Part 1 : Load Data and Packages

pacman::p_load(sf, spdep, GWmodel, SpatialML, 
               tmap, rsample, Metrics, tidyverse,
               knitr, kableExtra)
mdata <- read_rds("data/mdata.rds")
set.seed(1234)
# Jitter the data

for (i in 1:nrow(mdata)) {
  coords <- st_coordinates(mdata[i, ])
  jittered_coords <- coords + runif(n = 1, min = -0.1, max = 0.1)
  mdata[i,]$geometry <- st_sfc(st_point(c(jittered_coords), dim = "XY"), crs = 3414)
}

resale_split <- initial_split(mdata, 
                              prop = 6.5/10,)
train_data <- training(resale_split)
test_data <- testing(resale_split)
mdata_nogeo <- mdata %>% st_drop_geometry()
ggstatsplot::ggcorrmat(mdata_nogeo[, 2:17])

Part 2 : Linear Regression

price_mlr <- lm(resale_price ~ floor_area_sqm +
                  storey_order + remaining_lease_mths +
                  PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL + 
                  PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
                  WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
                  WITHIN_1KM_PRISCH,
                data=train_data)

# Prnt out a nice Report
olsrr::ols_regress(price_mlr)
                              Model Summary                                
--------------------------------------------------------------------------
R                           0.864       RMSE                    60891.465 
R-Squared                   0.746       MSE                3713159666.904 
Adj. R-Squared              0.746       Coef. Var                  14.068 
Pred R-Squared              0.746       AIC                    257079.714 
MAE                     47027.870       SBC                    257195.606 
--------------------------------------------------------------------------
 RMSE: Root Mean Square Error 
 MSE: Mean Square Error 
 MAE: Mean Absolute Error 
 AIC: Akaike Information Criteria 
 SBC: Schwarz Bayesian Criteria 

                                     ANOVA                                       
--------------------------------------------------------------------------------
                    Sum of                                                      
                   Squares           DF       Mean Square       F          Sig. 
--------------------------------------------------------------------------------
Regression    1.128008e+14           14      8.057201e+12    2169.904    0.0000 
Residual      3.831981e+13        10320    3713159666.904                       
Total         1.511206e+14        10334                                         
--------------------------------------------------------------------------------

                                               Parameter Estimates                                                 
------------------------------------------------------------------------------------------------------------------
                   model          Beta    Std. Error    Std. Beta       t        Sig          lower         upper 
------------------------------------------------------------------------------------------------------------------
             (Intercept)    116657.901     10562.315                  11.045    0.000     95953.716    137362.087 
          floor_area_sqm      2755.393        89.663        0.162     30.730    0.000      2579.635      2931.150 
            storey_order     14158.799       333.897        0.230     42.405    0.000     13504.297     14813.301 
    remaining_lease_mths       346.494         4.556        0.442     76.058    0.000       337.564       355.424 
                PROX_CBD    -17205.198       199.202       -0.593    -86.371    0.000    -17595.672    -16814.724 
        PROX_ELDERLYCARE    -13344.152       979.799       -0.074    -13.619    0.000    -15264.749    -11423.555 
             PROX_HAWKER    -19225.464      1267.105       -0.082    -15.173    0.000    -21709.235    -16741.693 
                PROX_MRT    -34996.049      1724.248       -0.112    -20.296    0.000    -38375.908    -31616.189 
               PROX_PARK     -5931.813      1455.400       -0.022     -4.076    0.000     -8784.680     -3078.947 
               PROX_MALL    -17450.175      1999.149       -0.052     -8.729    0.000    -21368.896    -13531.455 
        PROX_SUPERMARKET    -26563.411      4115.914       -0.034     -6.454    0.000    -34631.401    -18495.422 
WITHIN_350M_KINDERGARTEN      7846.864       625.644        0.066     12.542    0.000      6620.482      9073.247 
   WITHIN_350M_CHILDCARE     -4357.283       351.829       -0.071    -12.385    0.000     -5046.935     -3667.630 
         WITHIN_350M_BUS       886.959       221.590        0.021      4.003    0.000       452.600      1321.317 
       WITHIN_1KM_PRISCH     -8731.084       483.917       -0.110    -18.043    0.000     -9679.655     -7782.512 
------------------------------------------------------------------------------------------------------------------
# Also use package easystats for modelling visualization and reporting
vif <- performance::check_collinearity(price_mlr)
kable(vif, caption = "Variance Inflation Factor (VIF) Results") %>% kable_styling(font_size = 18)
Variance Inflation Factor (VIF) Results
Term VIF VIF_CI_low VIF_CI_high SE_factor Tolerance Tolerance_CI_low Tolerance_CI_high
floor_area_sqm 1.129928 1.107808 1.156586 1.062981 0.8850121 0.8646133 0.9026833
storey_order 1.197418 1.172317 1.226175 1.094266 0.8351303 0.8155443 0.8530114
remaining_lease_mths 1.375186 1.342965 1.410433 1.172683 0.7271745 0.7090019 0.7446210
PROX_CBD 1.915262 1.862374 1.971395 1.383930 0.5221217 0.5072551 0.5369492
PROX_ELDERLYCARE 1.195039 1.170038 1.223715 1.093178 0.8367930 0.8171838 0.8546730
PROX_HAWKER 1.194439 1.169464 1.223095 1.092904 0.8372129 0.8175978 0.8550926
PROX_MRT 1.243143 1.216156 1.273500 1.114963 0.8044123 0.7852373 0.8222629
PROX_PARK 1.202157 1.176857 1.231075 1.096429 0.8318384 0.8122981 0.8497208
PROX_MALL 1.422083 1.388040 1.459112 1.192511 0.7031939 0.6853483 0.7204403
PROX_SUPERMARKET 1.160541 1.137026 1.188092 1.077284 0.8616670 0.8416858 0.8794875
WITHIN_350M_KINDERGARTEN 1.121277 1.099572 1.147712 1.058903 0.8918406 0.8712983 0.9094449
WITHIN_350M_CHILDCARE 1.326777 1.296452 1.360204 1.151858 0.7537061 0.7351839 0.7713358
WITHIN_350M_BUS 1.151343 1.128237 1.178612 1.073006 0.8685509 0.8484557 0.8863383
WITHIN_1KM_PRISCH 1.526322 1.488261 1.567349 1.235444 0.6551699 0.6380201 0.6719251
# Anything > 5 would mean some collinear, more than 10 shoud just exclude
plot(vif) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
Variable `Component` is not in your data frame :/

Part 3 : Predictive Modeling

bw_adaptive <- bw.gwr(resale_price ~ floor_area_sqm +
                  storey_order + remaining_lease_mths +
                  PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL + 
                  PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
                  WITHIN_350M_CHILDCARE +
                  WITHIN_1KM_PRISCH,
                  data=train_data,
                  approach="CV",
                  kernel="gaussian",
                  adaptive=TRUE,
                  longlat=FALSE)
Take a cup of tea and have a break, it will take a few minutes.
          -----A kind suggestion from GWmodel development group
Adaptive bandwidth: 6395 CV score: 3.530305e+13 
Adaptive bandwidth: 3960 CV score: 3.255536e+13 
Adaptive bandwidth: 2455 CV score: 2.880938e+13 
Adaptive bandwidth: 1524 CV score: 2.513316e+13 
Adaptive bandwidth: 950 CV score: 1.933083e+13 
Adaptive bandwidth: 593 CV score: 1.558231e+13 
Adaptive bandwidth: 375 CV score: 1.280445e+13 
Adaptive bandwidth: 237 CV score: 1.07939e+13 
Adaptive bandwidth: 155 CV score: 9.258612e+12 
Adaptive bandwidth: 101 CV score: 8.158468e+12 
Adaptive bandwidth: 71 CV score: 7.384932e+12 
Adaptive bandwidth: 49 CV score: 6.720673e+12 
Adaptive bandwidth: 38 CV score: 6.442666e+12 
Adaptive bandwidth: 29 CV score: Inf 
Adaptive bandwidth: 41 CV score: 6.527444e+12 
Adaptive bandwidth: 33 CV score: Inf 
Adaptive bandwidth: 38 CV score: 6.442666e+12 
gwr_adaptive <- gwr.basic(formula = resale_price ~
                            floor_area_sqm + storey_order +
                            remaining_lease_mths + PROX_CBD + 
                            PROX_ELDERLYCARE + PROX_HAWKER +
                            PROX_MRT + PROX_PARK + PROX_MALL + 
                            PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
                            WITHIN_350M_CHILDCARE + WITHIN_1KM_PRISCH,
                          data=train_data,
                          bw=bw_adaptive, 
                          kernel = 'gaussian', 
                          adaptive=TRUE,
                          longlat = FALSE)
gwr_bw_test_adaptive <- bw.gwr(resale_price ~ floor_area_sqm +
                  storey_order + remaining_lease_mths +
                  PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL + 
                  PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
                  WITHIN_350M_CHILDCARE +
                  WITHIN_1KM_PRISCH,
                  data=test_data,
                  approach="CV",
                  kernel="gaussian",
                  adaptive=TRUE,
                  longlat=FALSE)
Take a cup of tea and have a break, it will take a few minutes.
          -----A kind suggestion from GWmodel development group
Adaptive bandwidth: 3447 CV score: 1.990467e+13 
Adaptive bandwidth: 2138 CV score: 1.82957e+13 
Adaptive bandwidth: 1328 CV score: 1.617623e+13 
Adaptive bandwidth: 828 CV score: 1.417383e+13 
Adaptive bandwidth: 518 CV score: 1.121439e+13 
Adaptive bandwidth: 327 CV score: 9.053746e+12 
Adaptive bandwidth: 208 CV score: 7.571744e+12 
Adaptive bandwidth: 135 CV score: 6.593324e+12 
Adaptive bandwidth: 89 CV score: 5.817039e+12 
Adaptive bandwidth: 62 CV score: 5.312371e+12 
Adaptive bandwidth: 43 CV score: 4.845013e+12 
Adaptive bandwidth: 34 CV score: 4.64567e+12 
Adaptive bandwidth: 25 CV score: 4.542503e+12 
Adaptive bandwidth: 23 CV score: 4.571456e+12 
Adaptive bandwidth: 30 CV score: 4.561981e+12 
Adaptive bandwidth: 25 CV score: 4.542503e+12 
# Im still getting a singular matrix error despite testing and debugging
#gwr_pred <- gwr.predict(formula = resale_price ~
#                         floor_area_sqm + storey_order +
#                         remaining_lease_mths + PROX_CBD + 
#                         PROX_ELDERLYCARE + PROX_HAWKER + 
#                         PROX_MRT + PROX_PARK + PROX_MALL + 
#                         PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
#                         WITHIN_350M_CHILDCARE + 
#                         WITHIN_1KM_PRISCH, 
#                       data= train_data, 
#                       predictdata = test_data, 
#                       bw=bw_adaptive, 
#                       kernel = 'gaussian', 
#                       adaptive= FALSE, 
#                       longlat = FALSE)

Part 4 : Spatial-ML Methods

Get Coordinates Train-Test Split

coords <- st_coordinates(mdata)
coords_train <- st_coordinates(train_data)
coords_test <- st_coordinates(test_data)

Remove Geometry

train_data_nogeom <- train_data %>% st_drop_geometry()

Do Random Forests

set.seed(1234)
rf <- ranger(resale_price ~ floor_area_sqm + storey_order + 
               remaining_lease_mths + PROX_CBD + PROX_ELDERLYCARE + 
               PROX_HAWKER + PROX_MRT + PROX_PARK + PROX_MALL + 
               PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
               WITHIN_350M_CHILDCARE + WITHIN_350M_BUS + 
               WITHIN_1KM_PRISCH,
             data=train_data_nogeom)
rf
Ranger result

Call:
 ranger(resale_price ~ floor_area_sqm + storey_order + remaining_lease_mths +      PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER + PROX_MRT + PROX_PARK +      PROX_MALL + PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +      WITHIN_350M_CHILDCARE + WITHIN_350M_BUS + WITHIN_1KM_PRISCH,      data = train_data_nogeom) 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      10335 
Number of independent variables:  14 
Mtry:                             3 
Target node size:                 5 
Variable importance mode:         none 
Splitrule:                        variance 
OOB prediction error (MSE):       708877837 
R squared (OOB):                  0.9515252 
set.seed(1234)
gwRF_adaptive <- grf(formula = resale_price ~ floor_area_sqm + storey_order +
                       remaining_lease_mths + PROX_CBD + PROX_ELDERLYCARE +
                       PROX_HAWKER + PROX_MRT + PROX_PARK + PROX_MALL +
                       PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
                       WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
                       WITHIN_1KM_PRISCH,
                     dframe=train_data_nogeom, 
                     bw=55,
                     kernel="adaptive",
                     coords=coords_train)

Number of Observations: 10335
Number of Independent Variables: 14
Kernel: Adaptive
Neightbours: 55

--------------- Global ML Model Summary ---------------
Ranger result

Call:
 ranger(resale_price ~ floor_area_sqm + storey_order + remaining_lease_mths +      PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER + PROX_MRT + PROX_PARK +      PROX_MALL + PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +      WITHIN_350M_CHILDCARE + WITHIN_350M_BUS + WITHIN_1KM_PRISCH,      data = train_data_nogeom, num.trees = 500, mtry = 4, importance = "impurity",      num.threads = NULL) 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      10335 
Number of independent variables:  14 
Mtry:                             4 
Target node size:                 5 
Variable importance mode:         impurity 
Splitrule:                        variance 
OOB prediction error (MSE):       672555845 
R squared (OOB):                  0.954009 

Importance:
          floor_area_sqm             storey_order     remaining_lease_mths 
            6.957558e+12             1.445835e+13             2.942620e+13 
                PROX_CBD         PROX_ELDERLYCARE              PROX_HAWKER 
            5.480924e+13             7.166729e+12             5.683696e+12 
                PROX_MRT                PROX_PARK                PROX_MALL 
            8.141868e+12             5.006469e+12             4.205909e+12 
        PROX_SUPERMARKET WITHIN_350M_KINDERGARTEN    WITHIN_350M_CHILDCARE 
            2.769281e+12             9.319988e+11             1.693840e+12 
         WITHIN_350M_BUS        WITHIN_1KM_PRISCH 
            1.604206e+12             7.118579e+12 

Mean Square Error (Not OOB): 165916724.251
R-squared (Not OOB) %: 98.865
AIC (Not OOB): 195640.509
AICc (Not OOB): 195640.556

--------------- Local Model Summary ---------------

Residuals OOB:
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-216953.8  -14428.6     606.2     218.1   14941.9  227138.0 

Residuals Predicted (Not OOB):
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-82542.08  -3533.79     78.63     29.87   3746.36  63248.20 

Local Variable Importance:
                               Min          Max        Mean         StD
floor_area_sqm                   0 347702688833 16952096525 36996489828
storey_order             242142687 289172735788 16426959259 24298165408
remaining_lease_mths     440657099 579272913133 32619674467 65603975965
PROX_CBD                 112594042 340583744430 11604192564 25726882688
PROX_ELDERLYCARE         101476719 332688632806 10261577749 22659387882
PROX_HAWKER              103893583 299569897721  9905264951 20939125860
PROX_MRT                  98947793 269150464299 10276749397 23591023441
PROX_PARK                107229395 347121976193  9559739547 21906746329
PROX_MALL                105250860 437631810372 11595469504 28316050893
PROX_SUPERMARKET         112316285 342234698506  9831051060 24793192793
WITHIN_350M_KINDERGARTEN         0 219947190500  2793653229 13546728631
WITHIN_350M_CHILDCARE            0 210593671219  5081883890 17359704011
WITHIN_350M_BUS                  0 215473141324  4445498438 11521982207
WITHIN_1KM_PRISCH                0 179945964150  1716983401  7296404134

Mean squared error (OOB): 792305097.809
R-squared (OOB) %: 94.581
AIC (OOB): 211798.874
AICc (OOB): 211798.921
Mean squared error Predicted (Not OOB): 67123193.54
R-squared Predicted (Not OOB) %: 99.541
AIC Predicted (Not OOB): 186287.785
AICc Predicted (Not OOB): 186287.832

Calculation time (in seconds): 9.6303
test_data_nogeom <- cbind(test_data, coords_test) %>% st_drop_geometry()
gwRF_pred <- predict.grf(gwRF_adaptive,
                        test_data_nogeom,
                        x.var.name="X",
                        y.var.name="Y",
                        local.w=1,
                        global.w=0)

Compare against test data

GRF_pred_df <- as.data.frame(gwRF_pred)

test_data_pred <- cbind(test_data,
                        GRF_pred_df)

Show test prediction over and under predict

rmse(test_data_pred$resale_price, 
     test_data_pred$gwRF_pred)
[1] 28677.78
ggplot(data = test_data_pred,
       aes(x = gwRF_pred,
           y = resale_price)) +
  geom_point()

Show by location

test_data_pred$residuals <- test_data_pred$gwRF_pred - test_data_pred$resale_price
st_crs(test_data_pred)
Coordinate Reference System:
  User input: EPSG:3414 
  wkt:
PROJCRS["SVY21 / Singapore TM",
    BASEGEOGCRS["SVY21",
        DATUM["SVY21",
            ELLIPSOID["WGS 84",6378137,298.257223563,
                LENGTHUNIT["metre",1]]],
        PRIMEM["Greenwich",0,
            ANGLEUNIT["degree",0.0174532925199433]],
        ID["EPSG",4757]],
    CONVERSION["Singapore Transverse Mercator",
        METHOD["Transverse Mercator",
            ID["EPSG",9807]],
        PARAMETER["Latitude of natural origin",1.36666666666667,
            ANGLEUNIT["degree",0.0174532925199433],
            ID["EPSG",8801]],
        PARAMETER["Longitude of natural origin",103.833333333333,
            ANGLEUNIT["degree",0.0174532925199433],
            ID["EPSG",8802]],
        PARAMETER["Scale factor at natural origin",1,
            SCALEUNIT["unity",1],
            ID["EPSG",8805]],
        PARAMETER["False easting",28001.642,
            LENGTHUNIT["metre",1],
            ID["EPSG",8806]],
        PARAMETER["False northing",38744.572,
            LENGTHUNIT["metre",1],
            ID["EPSG",8807]]],
    CS[Cartesian,2],
        AXIS["northing (N)",north,
            ORDER[1],
            LENGTHUNIT["metre",1]],
        AXIS["easting (E)",east,
            ORDER[2],
            LENGTHUNIT["metre",1]],
    USAGE[
        SCOPE["Cadastre, engineering survey, topographic mapping."],
        AREA["Singapore - onshore and offshore."],
        BBOX[1.13,103.59,1.47,104.07]],
    ID["EPSG",3414]]
# Load in the mpsz data
mpsz = st_read(dsn = "data/geospatial", layer = "MP14_SUBZONE_WEB_PL") %>% st_transform(3414)
Reading layer `MP14_SUBZONE_WEB_PL' from data source 
  `C:\Users\Admin\Desktop\SMU\ISSS626\ISSS626-KierenChua\InClassEx\InClassEx08\data\geospatial' 
  using driver `ESRI Shapefile'
Simple feature collection with 323 features and 15 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
Projected CRS: SVY21
# plot on tmap
tmap_mode("view")
tmap mode set to interactive viewing
tm_shape(mpsz) +
    tmap_options(check.and.fix = TRUE) +
    tm_polygons(alpha = 0.4) +
tm_shape(test_data_pred) +
    tm_dots(col = "residuals",
            alpha = 0.6,
            style = "quantile")
Warning: The shape mpsz is invalid (after reprojection). See sf::st_is_valid
Variable(s) "residuals" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
tmap_mode("plot")
tmap mode set to plotting